﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using Stemming;

namespace Preprocess2
{
    /// <summary>
    /// Console tool that builds a vocabulary and per-document token indexes for a text corpus.
    /// </summary>
    /// <remarks>
    /// Reads every <c>*.txt</c> file in the <c>Pure Text</c> folder of the corpus, tokenises it,
    /// and writes into the <c>Index</c> folder:
    /// <c>vocab.txt</c> (one "index TAB word" line per distinct token) and one
    /// <c>NAME.index.txt</c> per document (one "vocabulary index TAB token" line per token,
    /// in document order).
    /// </remarks>
    class Program
    {
        /// <summary>
        /// Program entry point: reads the corpus, writes the vocabulary and the per-document
        /// indexes, prints a summary and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string textPath = @"..\..\..\..\Text Corpus\";
            string sourceDir = Path.Combine(textPath, "Pure Text");
            string indexDir = Path.Combine(textPath, "Index");

            FileInfo[] rgFiles = new DirectoryInfo(sourceDir).GetFiles("*.txt");
            HashSet<string> vocabulary = new HashSet<string>();
            string[][] docwords = ReadDocuments(rgFiles, vocabulary);

            // Make sure the output folder exists; this is a no-op when it is already there.
            Directory.CreateDirectory(indexDir);

            Dictionary<string, int> dict = WriteVocabulary(vocabulary, Path.Combine(indexDir, "vocab.txt"));
            WriteDocumentIndexes(rgFiles, docwords, dict, indexDir);

            Console.WriteLine("Finish Parsing, {0} Vocabulary", vocabulary.Count);
            Console.ReadKey();
        }

        /// <summary>
        /// Tokenises every file and collects all distinct tokens.
        /// </summary>
        /// <param name="files">Documents to read.</param>
        /// <param name="vocabulary">Set that receives every token encountered.</param>
        /// <returns>The token sequence of each document, in the same order as <paramref name="files"/>.</returns>
        private static string[][] ReadDocuments(FileInfo[] files, HashSet<string> vocabulary)
        {
            string[][] docwords = new string[files.Length][];
            for (int m = 0; m < files.Length; m++)
            {
                FileInfo fi = files[m];
                Console.WriteLine("Reading {0}", fi.Name);

                // NOTE(review): the instance is never used directly; it is still constructed per file
                // in case the constructor initialises state the tokeniser depends on - confirm before removing.
                StopWordsHandler swh = new StopWordsHandler();

                string doc;
                // 'using' guarantees the file handle is released even if reading throws.
                using (StreamReader sr = new StreamReader(fi.FullName))
                {
                    doc = sr.ReadToEnd();
                }

                Tokeniser tk = new Tokeniser();
                docwords[m] = tk.Partition(doc);

                // Porter stemming (Stemming.PorterStemmer) is currently not applied;
                // tokens are indexed exactly as the tokeniser returns them.
                foreach (string token in docwords[m])
                {
                    vocabulary.Add(token);
                }
            }
            return docwords;
        }

        /// <summary>
        /// Assigns consecutive indexes (starting at 0) to the vocabulary in enumeration order
        /// and writes them to <paramref name="vocabPath"/> as "index TAB word" lines.
        /// </summary>
        /// <param name="vocabulary">Distinct tokens of the corpus.</param>
        /// <param name="vocabPath">Path of the vocabulary file to create or overwrite.</param>
        /// <returns>Lookup from token to its assigned index.</returns>
        private static Dictionary<string, int> WriteVocabulary(HashSet<string> vocabulary, string vocabPath)
        {
            Dictionary<string, int> dict = new Dictionary<string, int>();
            int index = 0;
            using (StreamWriter sw = new StreamWriter(vocabPath))
            {
                foreach (string word in vocabulary)
                {
                    dict[word] = index;
                    sw.WriteLine("{0}\t{1}", index, word);
                    index++;
                }
            }
            return dict;
        }

        /// <summary>
        /// Writes one <c>NAME.index.txt</c> file per document containing a
        /// "vocabulary index TAB token" line for every token of that document.
        /// </summary>
        /// <param name="files">Source documents; used to derive the output file names.</param>
        /// <param name="docwords">Token sequence of each document, parallel to <paramref name="files"/>.</param>
        /// <param name="dict">Lookup from token to vocabulary index.</param>
        /// <param name="indexDir">Folder that receives the index files.</param>
        private static void WriteDocumentIndexes(FileInfo[] files, string[][] docwords, Dictionary<string, int> dict, string indexDir)
        {
            for (int m = 0; m < files.Length; m++)
            {
                // Strip only the final extension so that "a.b.txt" and "a.c.txt" get distinct
                // output files instead of both collapsing to "a.index.txt".
                string baseName = Path.GetFileNameWithoutExtension(files[m].Name);
                Console.WriteLine("Writing {0}.index.txt", baseName);

                using (StreamWriter sw = new StreamWriter(Path.Combine(indexDir, baseName + ".index.txt")))
                {
                    foreach (string token in docwords[m])
                    {
                        sw.WriteLine("{0}\t{1}", dict[token], token);
                    }
                }
            }
        }
    }
}
